# Basic
import re
import numpy as np
import pandas as pd
# Plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Machine Learning
from sklearn.model_selection import train_test_split
# Text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
# Other
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')  # silence library deprecation/user warnings in notebook output
import plotly
import plotly.io as pio
# Configure plotly to render figures inline inside the notebook.
plotly.offline.init_notebook_mode(connected=True)
pio.renderers.default='notebook'
# Load the Kaggle MBTI dataset; two columns: 'type' (MBTI label) and 'posts'
# ('|||'-separated forum posts of one user per row).
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data.shape
(8675, 2)
# Dtypes and non-null counts — both columns are non-null object (string).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8675 entries, 0 to 8674 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 type 8675 non-null object 1 posts 8675 non-null object dtypes: object(2) memory usage: 135.7+ KB
# Preview the first rows.
data.head()
| type | posts | |
|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... |
# Show some posts of the first user
# Each record packs many posts separated by '|||'; split them apart.
user1post = [post.split('|||') for post in data.head(1).posts.values] # list
# NOTE(review): sample(10) has no random_state, so the displayed posts
# differ on every run — confirm whether reproducibility matters here.
user1post_random = pd.DataFrame(
data=np.array(user1post).transpose(),
columns=['post']).sample(10)
# Left-align header and body cells for readability.
user1post_random.style.set_table_styles([
{'selector': 'th', 'props': [('text-align', 'left')]},
{'selector': 'td', 'props': [('text-align', 'left')]}
], overwrite=False)
| post | |
|---|---|
| 27 | http://www.youtube.com/watch?v=4V2uYORhQOk |
| 16 | It appears to be too late. :sad: |
| 19 | I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to... |
| 32 | Banned for a whole host of reasons! |
| 44 | http://www.youtube.com/watch?v=w8IgImn57aQ |
| 47 | I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too... |
| 2 | enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks |
| 10 | http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg Game. Set. Match. |
| 26 | http://www.youtube.com/watch?v=Mw7eoU3BMbE |
| 5 | May the PerC Experience immerse you. |
# Many records contain URLs (substring "www"); these are replaced during cleaning below.
search_URL = data.loc[data['posts'].str.contains("www", case=True)]
search_URL
| type | posts | |
|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... |
| ... | ... | ... |
| 8667 | ENTP | 'I think generally people experience post trau... |
| 8669 | INFJ | 'I'm not sure about a method for picking out I... |
| 8670 | ISFP | 'https://www.youtube.com/watch?v=t8edHB_h908||... |
| 8672 | INTP | 'So many questions when i do these things. I ... |
| 8674 | INFP | 'It has been too long since I have been on per... |
4537 rows × 2 columns
# Shared color palette reused by later figures.
color = px.colors.sequential.Sunset_r
# Per-type record counts as a tidy frame: columns 'type' and 'counts'.
df_count = data['type'].value_counts()
df_count = df_count.rename_axis('type').reset_index(name='counts')
# Bar chart of the (imbalanced) MBTI class distribution.
fig = px.bar(df_count, x='type', y='counts',
color='type', color_discrete_sequence=color,
title='Type count',
width=1000,
height=600)
fig.show()
# Stratified 80/20 split so each MBTI type keeps the same proportion
# in train and test; random_state fixed for reproducibility.
train_data, test_data = train_test_split(data,
test_size=0.2,
random_state=42,
stratify=data.type)
train_data
| type | posts | |
|---|---|---|
| 1228 | INFP | 'We are mandarin speakers. He receive educati... |
| 1290 | ISTP | 'Nope. Not now, not ever. I'm too busy with ... |
| 6756 | ENFJ | 'That's the only one I haven't gotten to read ... |
| 1662 | INFP | 'I used to think that maturity was burning bri... |
| 3338 | INFP | 'I get typed as both a 4w5 and 5w6 as well but... |
| ... | ... | ... |
| 7292 | INFP | Haven't posted here in a while. Here was my at... |
| 1086 | INFP | 'Ok, I'll go first. I'm a 29 year old INFP mal... |
| 7435 | ENTJ | 'I have dated a few INFJs, including my curren... |
| 1843 | INTP | 'People who are unable to replace social norms... |
| 2530 | ENTP | 'Yep! you're right! I agree with you!! i think... |
6940 rows × 2 columns
# Per-type counts for each split, used by the pie charts below.
df_train_count = train_data['type'].value_counts().rename_axis('type').reset_index(name='counts')
df_test_count = test_data['type'].value_counts().rename_axis('type').reset_index(name='counts')
df_train_count
| type | counts | |
|---|---|---|
| 0 | INFP | 1465 |
| 1 | INFJ | 1176 |
| 2 | INTP | 1043 |
| 3 | INTJ | 873 |
| 4 | ENTP | 548 |
| 5 | ENFP | 540 |
| 6 | ISTP | 270 |
| 7 | ISFP | 217 |
| 8 | ENTJ | 185 |
| 9 | ISTJ | 164 |
| 10 | ENFJ | 152 |
| 11 | ISFJ | 133 |
| 12 | ESTP | 71 |
| 13 | ESFP | 38 |
| 14 | ESFJ | 34 |
| 15 | ESTJ | 31 |
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
# Left pie: train split; right pie: test split (row 1, cols 1 and 2).
fig.add_trace(go.Pie(labels=df_train_count['type'], values=df_train_count['counts'], name='Train'),
1, 1)
fig.add_trace(go.Pie(labels=df_test_count['type'], values=df_test_count['counts'], name='Test'),
1, 2)
# hole=.4 makes donut charts; same palette and hover format for both.
fig.update_traces(hole=.4, hovertemplate='Type: %{label}<br>Count: %{value}', marker_colors=color)
# Annotations place the 'Train'/'Test' labels inside the donut holes.
fig.update_layout(
title_text = "Train_Test Split",
annotations = [dict(text='Train', x=0.2, y=0.5, font_size=18, showarrow=False),
dict(text='Test', x=0.795, y=0.5, font_size=18, showarrow=False)])
fig.show()
# fig = px.pie(train_data, names='type', title='Train data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()
# fig = px.pie(test_data, names='type', title='Test data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()
To do: write getCleanPost — remove the '|||' post separators, replace hyperlinks with a placeholder token, keep only alphanumeric characters, and collapse redundant spaces.
def getCleanPost(text):
    """Flatten one raw 'posts' record into plain space-separated words.

    Drops the '|||' post separators, replaces every hyperlink with the
    literal token "URL", turns all non-alphanumeric characters into
    spaces, and collapses runs of spaces into one.
    """
    substitutions = (
        (r'\|\|\|', r' '),      # remove the post separator
        (r'http\S+', r'URL'),   # replace hyperlinks with "URL"
        ('[^0-9a-zA-Z]', ' '),  # keep only letters and digits
        (' +', ' '),            # collapse repeated spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
'''
Example of getCleanPost.
input: first 520 characters of data.posts[0]
output: getCleanPost(input)
'''
origi_sentence = data.posts[0][0:520]
clean_sentence = getCleanPost(origi_sentence)
# ANSI escape codes colour the printed text (96 = cyan, 94 = blue).
print('\033[96mBefore cleaning:\n',origi_sentence,'\n')
print('\033[94mAfter cleaning:\n',clean_sentence)
Before cleaning: 'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend pos After cleaning: URL URL enfp and intj moments URL sportscenter not top ten plays URL pranks What has been the most life changing experience in your life URL URL On repeat for most of today May the PerC Experience immerse you The last thing my INFJ friend pos
# Apply getCleanPost to all training data
train_data_copy = train_data.copy()  # work on a copy; leave train_data untouched
tqdm.pandas() # Progress bar
train_data_copy['posts_clean'] = train_data_copy['posts'].progress_apply(getCleanPost)
train_data_copy
0%| | 0/6940 [00:00<?, ?it/s]
| type | posts | posts_clean | |
|---|---|---|---|
| 1228 | INFP | 'We are mandarin speakers. He receive educati... | We are mandarin speakers He receive education... |
| 1290 | ISTP | 'Nope. Not now, not ever. I'm too busy with ... | Nope Not now not ever I m too busy with work ... |
| 6756 | ENFJ | 'That's the only one I haven't gotten to read ... | That s the only one I haven t gotten to read ... |
| 1662 | INFP | 'I used to think that maturity was burning bri... | I used to think that maturity was burning bri... |
| 3338 | INFP | 'I get typed as both a 4w5 and 5w6 as well but... | I get typed as both a 4w5 and 5w6 as well but... |
| ... | ... | ... | ... |
| 7292 | INFP | Haven't posted here in a while. Here was my at... | Haven t posted here in a while Here was my att... |
| 1086 | INFP | 'Ok, I'll go first. I'm a 29 year old INFP mal... | Ok I ll go first I m a 29 year old INFP male ... |
| 7435 | ENTJ | 'I have dated a few INFJs, including my curren... | I have dated a few INFJs including my current... |
| 1843 | INTP | 'People who are unable to replace social norms... | People who are unable to replace social norms... |
| 2530 | ENTP | 'Yep! you're right! I agree with you!! i think... | Yep you re right I agree with you i think see... |
6940 rows × 3 columns
# NLTK English stop-word list (pronouns, articles, auxiliaries, ...),
# used as the filter in getCleanToken and Preprocessor below.
stop_words = stopwords.words('english')
print('Stop words\n',stop_words)
Stop words ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def getCleanToken(text):
    """Clean one raw 'posts' record and return lowercase tokens minus stop words.

    Applies the same regex cleaning as getCleanPost (drop '|||' separators,
    replace hyperlinks with "URL", keep only alphanumerics, collapse spaces),
    then lowercases, tokenizes with nltk's word_tokenize, and removes English
    stop words.

    Returns:
        list[str]: the filtered tokens.
    """
    # getCleanPost steps
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'URL', text)
    text = re.sub('[^0-9a-zA-Z]', ' ', text)
    text = re.sub(' +', ' ', text)
    # Tokenization and stop-word removal.
    text = text.lower()
    tokens = word_tokenize(text)
    # Perf fix: the original tested membership against the ~180-element
    # stop-word *list* for every token (O(len(list)) per token); a set
    # gives O(1) lookups with identical results.
    stop_set = set(stop_words)
    return [w for w in tokens if w not in stop_set]
'''
Example of the added part in getCleanToken.
Referred to the paragraph # Add "Tokenization" and remove stopword
input: getCleanPost(user #1228)
output: getCleanToken(user #1228)
'''
clean_post = getCleanPost(train_data.posts[1228])
# Tokenization
tokens = word_tokenize(clean_post)
print(f'Original: {len(tokens)} tokens\n')
# Stop words
filtered_tokens = [w for w in tokens if not w in stop_words]
print(f'After removing stop words: {len(filtered_tokens)} tokens\n')
# Check removed words (set difference: tokens dropped by the filter)
print(f'Removed words: {list(set(tokens).difference(set(filtered_tokens)))}')
Original: 879 tokens After removing stop words: 508 tokens Removed words: ['up', 'which', 'each', 'and', 'will', 'about', 'only', 'him', 'no', 'yours', 'it', 'why', 'the', 'out', 'just', 'am', 's', 'from', 'who', 'or', 'with', 't', 'be', 'll', 'all', 'can', 'if', 'very', 'do', 'were', 'that', 'he', 'what', 'then', 'has', 'more', 'my', 'they', 'at', 'of', 'few', 'while', 'your', 'other', 'have', 'o', 'we', 'm', 'them', 'where', 'for', 'is', 'those', 'you', 'to', 'did', 'any', 'don', 'her', 'an', 'some', 'here', 'in', 'won', 'too', 'as', 'me', 'his', 'so', 'now', 'are', 'this', 'their', 'once', 'a', 'on', 'not', 'how', 'when', 'because']
# Apply getCleanToken to all training data
tqdm.pandas() # Progress bar
train_data_copy['tokens_clean'] = train_data_copy['posts'].progress_apply(getCleanToken)
0%| | 0/6940 [00:00<?, ?it/s]
# Inspect the frame with the new tokens_clean column.
train_data_copy
| type | posts | posts_clean | tokens_clean | |
|---|---|---|---|---|
| 1228 | INFP | 'We are mandarin speakers. He receive educati... | We are mandarin speakers He receive education... | [mandarin, speakers, receive, education, canad... |
| 1290 | ISTP | 'Nope. Not now, not ever. I'm too busy with ... | Nope Not now not ever I m too busy with work ... | [nope, ever, busy, work, causes, adrenaline, r... |
| 6756 | ENFJ | 'That's the only one I haven't gotten to read ... | That s the only one I haven t gotten to read ... | [one, gotten, read, yet, might, pick, one, boo... |
| 1662 | INFP | 'I used to think that maturity was burning bri... | I used to think that maturity was burning bri... | [used, think, maturity, burning, bridges, with... |
| 3338 | INFP | 'I get typed as both a 4w5 and 5w6 as well but... | I get typed as both a 4w5 and 5w6 as well but... | [get, typed, 4w5, 5w6, well, like, consider, 4... |
| ... | ... | ... | ... | ... |
| 7292 | INFP | Haven't posted here in a while. Here was my at... | Haven t posted here in a while Here was my att... | [posted, attire, best, man, buddies, wedding, ... |
| 1086 | INFP | 'Ok, I'll go first. I'm a 29 year old INFP mal... | Ok I ll go first I m a 29 year old INFP male ... | [ok, go, first, 29, year, old, infp, male, int... |
| 7435 | ENTJ | 'I have dated a few INFJs, including my curren... | I have dated a few INFJs including my current... | [dated, infjs, including, current, partner, 6,... |
| 1843 | INTP | 'People who are unable to replace social norms... | People who are unable to replace social norms... | [people, unable, replace, social, norms, ratio... |
| 2530 | ENTP | 'Yep! you're right! I agree with you!! i think... | Yep you re right I agree with you i think see... | [yep, right, agree, think, seeking, pressure, ... |
6940 rows × 4 columns
# Statistics
# Word count before tokenization (whitespace split of the cleaned text)
# vs. token count after stop-word removal.
train_data_copy['Words count after getCleanPost'] = train_data_copy['posts_clean'].apply(lambda n: len(n.split()))
train_data_copy['Words count after getCleanToken'] = train_data_copy['tokens_clean'].str.len()
train_data_copy
| type | posts | posts_clean | tokens_clean | Words count after getCleanPost | Words count after getCleanToken | |
|---|---|---|---|---|---|---|
| 1228 | INFP | 'We are mandarin speakers. He receive educati... | We are mandarin speakers He receive education... | [mandarin, speakers, receive, education, canad... | 879 | 444 |
| 1290 | ISTP | 'Nope. Not now, not ever. I'm too busy with ... | Nope Not now not ever I m too busy with work ... | [nope, ever, busy, work, causes, adrenaline, r... | 1299 | 648 |
| 6756 | ENFJ | 'That's the only one I haven't gotten to read ... | That s the only one I haven t gotten to read ... | [one, gotten, read, yet, might, pick, one, boo... | 1273 | 571 |
| 1662 | INFP | 'I used to think that maturity was burning bri... | I used to think that maturity was burning bri... | [used, think, maturity, burning, bridges, with... | 1479 | 678 |
| 3338 | INFP | 'I get typed as both a 4w5 and 5w6 as well but... | I get typed as both a 4w5 and 5w6 as well but... | [get, typed, 4w5, 5w6, well, like, consider, 4... | 1142 | 522 |
| ... | ... | ... | ... | ... | ... | ... |
| 7292 | INFP | Haven't posted here in a while. Here was my at... | Haven t posted here in a while Here was my att... | [posted, attire, best, man, buddies, wedding, ... | 653 | 331 |
| 1086 | INFP | 'Ok, I'll go first. I'm a 29 year old INFP mal... | Ok I ll go first I m a 29 year old INFP male ... | [ok, go, first, 29, year, old, infp, male, int... | 1086 | 535 |
| 7435 | ENTJ | 'I have dated a few INFJs, including my curren... | I have dated a few INFJs including my current... | [dated, infjs, including, current, partner, 6,... | 1367 | 683 |
| 1843 | INTP | 'People who are unable to replace social norms... | People who are unable to replace social norms... | [people, unable, replace, social, norms, ratio... | 720 | 332 |
| 2530 | ENTP | 'Yep! you're right! I agree with you!! i think... | Yep you re right I agree with you i think see... | [yep, right, agree, think, seeking, pressure, ... | 1751 | 787 |
6940 rows × 6 columns
# Summary statistics of the two word-count columns.
train_data_copy.describe()
| Words count after getCleanPost | Words count after getCleanToken | |
|---|---|---|
| count | 6940.000000 | 6940.000000 |
| mean | 1320.436888 | 618.429251 |
| std | 325.409982 | 140.097720 |
| min | 5.000000 | 4.000000 |
| 25% | 1132.000000 | 539.000000 |
| 50% | 1374.000000 | 643.000000 |
| 75% | 1561.000000 | 721.000000 |
| max | 1998.000000 | 927.000000 |
Preprocessor(): the full pipeline — cleaning, tokenization, stop-word removal, stemming, and lemmatization.
def Preprocessor(text, stemmer='Snowball'):
    """Full preprocessing pipeline for one raw 'posts' record.

    Pipeline: regex cleaning (same as getCleanPost) -> lowercase ->
    tokenize -> stop-word removal -> stemming -> lemmatization.

    Args:
        text: raw posts string ('|||'-separated).
        stemmer: 'Snowball' (default) or 'Porter' — which NLTK stemmer to use.

    Returns:
        list[str]: stemmed and lemmatized tokens.

    BUG FIX: the original rebound the `stemmer` parameter to a
    SnowballStemmer instance before comparing it to 'Porter', so the
    Porter branch was unreachable (and it referenced a global
    `stemmer_ps` defined in a later cell). The parameter is now honoured,
    and the token list is stemmed exactly once. Default-argument
    behaviour is unchanged.
    """
    # getCleanToken steps
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'URL', text)
    text = re.sub('[^0-9a-zA-Z]', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_set = set(stop_words)  # set lookup: O(1) per token vs list scan
    filtered_tokens = [w for w in tokens if w not in stop_set]
    # Stemming and lemmatization — choose the stemmer from the parameter.
    stem = PorterStemmer() if stemmer == 'Porter' else SnowballStemmer("english")
    lemma = WordNetLemmatizer()
    stemmed = [stem.stem(t) for t in filtered_tokens]
    return [lemma.lemmatize(t) for t in stemmed]
'''
Example of the added part in Preprocessor.
Referred to the paragraph: # Add "Stemming" and "Lemmatization"
input: getCleanToken(user #1228)
output: Preprocessor(user #1228)
'''
clean_token = getCleanToken(train_data.posts[1228])
# Initiate
stemmer_ps = PorterStemmer()
stemmer_ss = SnowballStemmer("english")
lemma = WordNetLemmatizer()
# Stemming
stemmed_ps = [stemmer_ps.stem(t) for t in clean_token]
stemmed_ss = [stemmer_ss.stem(t) for t in clean_token]
# Lemmatizing
lemmatized_ps = [lemma.lemmatize(t) for t in stemmed_ps]
lemmatized_ss = [lemma.lemmatize(t) for t in stemmed_ss]
# Compare different Stemmer and Lemmatizer, which 'stle' stands for.
# One row per token, one column per stemmer/lemmatizer variant.
df_stle = pd.DataFrame(
list(zip(clean_token, stemmed_ps, stemmed_ss, lemmatized_ps, lemmatized_ss)),
columns =['Original(clean_token)', 'PorterStemmer', 'SnowballStemmer', 'Lemma with PorterStemmer', 'Lemma with SnowballStemmer'])
df_stle.head(10)
| Original(clean_token) | PorterStemmer | SnowballStemmer | Lemma with PorterStemmer | Lemma with SnowballStemmer | |
|---|---|---|---|---|---|
| 0 | mandarin | mandarin | mandarin | mandarin | mandarin |
| 1 | speakers | speaker | speaker | speaker | speaker |
| 2 | receive | receiv | receiv | receiv | receiv |
| 3 | education | educ | educ | educ | educ |
| 4 | canada | canada | canada | canada | canada |
| 5 | since | sinc | sinc | sinc | sinc |
| 6 | 13 | 13 | 13 | 13 | 13 |
| 7 | thanks | thank | thank | thank | thank |
| 8 | bellisaurius | bellisauriu | bellisaurius | bellisauriu | bellisaurius |
| 9 | appreciate | appreci | appreci | appreci | appreci |
# Rows where the two stemmers disagree on the stemmed form.
diff_result = df_stle.query('PorterStemmer != SnowballStemmer')
print(f'The PorterStemmer and SnowballStemmer has\
{diff_result.shape[0]} / {df_stle.shape[0]}\
different tokens in user #1228\'s posts.')
diff_result
The PorterStemmer and SnowballStemmer has 15 / 444 different tokens in user #1228's posts.
| Original(clean_token) | PorterStemmer | SnowballStemmer | Lemma with PorterStemmer | Lemma with SnowballStemmer | |
|---|---|---|---|---|---|
| 8 | bellisaurius | bellisauriu | bellisaurius | bellisauriu | bellisaurius |
| 10 | kindly | kindli | kind | kindli | kind |
| 41 | yes | ye | yes | ye | yes |
| 46 | yes | ye | yes | ye | yes |
| 157 | yes | ye | yes | ye | yes |
| 161 | saurus | sauru | saurus | sauru | saurus |
| 291 | dos | do | dos | do | do |
| 304 | pros | pro | pros | pro | pro |
| 318 | exactly | exactli | exact | exactli | exact |
| 382 | dos | do | dos | do | do |
| 387 | dos | do | dos | do | do |
| 399 | dos | do | dos | do | do |
| 409 | dos | do | dos | do | do |
| 414 | dos | do | dos | do | do |
| 422 | communication | commun | communic | commun | communic |
# Apply Preprocessor to all training data
tqdm.pandas() # Progress bar
train_data_copy['preprocessed'] = train_data_copy['posts'].progress_apply(Preprocessor)
0%| | 0/6940 [00:00<?, ?it/s]
# Display without the two word-count columns (positional columns 4 and 5).
train_data_copy.drop(train_data_copy.columns[[4,5]],axis = 1)
| type | posts | posts_clean | tokens_clean | preprocessed | |
|---|---|---|---|---|---|
| 1228 | INFP | 'We are mandarin speakers. He receive educati... | We are mandarin speakers He receive education... | [mandarin, speakers, receive, education, canad... | [mandarin, speaker, receiv, educ, canada, sinc... |
| 1290 | ISTP | 'Nope. Not now, not ever. I'm too busy with ... | Nope Not now not ever I m too busy with work ... | [nope, ever, busy, work, causes, adrenaline, r... | [nope, ever, busi, work, caus, adrenalin, rush... |
| 6756 | ENFJ | 'That's the only one I haven't gotten to read ... | That s the only one I haven t gotten to read ... | [one, gotten, read, yet, might, pick, one, boo... | [one, gotten, read, yet, might, pick, one, boo... |
| 1662 | INFP | 'I used to think that maturity was burning bri... | I used to think that maturity was burning bri... | [used, think, maturity, burning, bridges, with... | [use, think, matur, burn, bridg, without, seco... |
| 3338 | INFP | 'I get typed as both a 4w5 and 5w6 as well but... | I get typed as both a 4w5 and 5w6 as well but... | [get, typed, 4w5, 5w6, well, like, consider, 4... | [get, type, 4w5, 5w6, well, like, consid, 4w5,... |
| ... | ... | ... | ... | ... | ... |
| 7292 | INFP | Haven't posted here in a while. Here was my at... | Haven t posted here in a while Here was my att... | [posted, attire, best, man, buddies, wedding, ... | [post, attir, best, man, buddi, wed, 698410, u... |
| 1086 | INFP | 'Ok, I'll go first. I'm a 29 year old INFP mal... | Ok I ll go first I m a 29 year old INFP male ... | [ok, go, first, 29, year, old, infp, male, int... | [ok, go, first, 29, year, old, infp, male, int... |
| 7435 | ENTJ | 'I have dated a few INFJs, including my curren... | I have dated a few INFJs including my current... | [dated, infjs, including, current, partner, 6,... | [date, infj, includ, current, partner, 6, year... |
| 1843 | INTP | 'People who are unable to replace social norms... | People who are unable to replace social norms... | [people, unable, replace, social, norms, ratio... | [peopl, unabl, replac, social, norm, ration, e... |
| 2530 | ENTP | 'Yep! you're right! I agree with you!! i think... | Yep you re right I agree with you i think see... | [yep, right, agree, think, seeking, pressure, ... | [yep, right, agre, think, seek, pressur, relev... |
6940 rows × 5 columns
# Show the raw input for record 0. The slice [:800] takes 800 CHARACTERS,
# not words — the printed label said "800 words", which was wrong.
print(f'Input (800 characters):\n{train_data.posts[0][:800]}...')
Input (800 words): 'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~ http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times ...
# Show the fully preprocessed token list for the same record.
print(f'Output:\n{Preprocessor(train_data.posts[0])}')
Output: ['url', 'url', 'enfp', 'intj', 'moment', 'url', 'sportscent', 'top', 'ten', 'play', 'url', 'prank', 'life', 'chang', 'experi', 'life', 'url', 'url', 'repeat', 'today', 'may', 'perc', 'experi', 'immers', 'last', 'thing', 'infj', 'friend', 'post', 'facebook', 'commit', 'suicid', 'next', 'day', 'rest', 'peac', 'url', 'hello', 'enfj7', 'sorri', 'hear', 'distress', 'natur', 'relationship', 'perfect', 'time', 'everi', 'moment', 'exist', 'tri', 'figur', 'hard', 'time', 'time', 'growth', '84389', '84390', 'url', 'url', 'welcom', 'stuff', 'url', 'game', 'set', 'match', 'prozac', 'wellbrutin', 'least', 'thirti', 'minut', 'move', 'leg', 'mean', 'move', 'sit', 'desk', 'chair', 'weed', 'moder', 'mayb', 'tri', 'edibl', 'healthier', 'altern', 'basic', 'come', 'three', 'item', 'determin', 'type', 'whichev', 'type', 'want', 'would', 'like', 'use', 'given', 'type', 'cognit', 'function', 'whatnot', 'left', 'thing', 'moder', 'sim', 'inde', 'video', 'game', 'good', 'one', 'note', 'good', 'one', 'somewhat', 'subject', 'complet', 'promot', 'death', 'given', 'sim', 'dear', 'enfp', 'favorit', 'video', 'game', 'grow', 'current', 'favorit', 'video', 'game', 'cool', 'url', 'appear', 'late', 'sad', 'someon', 'everyon', 'wait', 'thought', 'confid', 'good', 'thing', 'cherish', 'time', 'solitud', 'b', 'c', 'revel', 'within', 'inner', 'world', 'wherea', 'time', 'workin', 'enjoy', 'time', 'worri', 'peopl', 'alway', 'around', 'yo', 'entp', 'ladi', 'complimentari', 'person', 'well', 'hey', 'main', 'social', 'outlet', 'xbox', 'live', 'convers', 'even', 'verbal', 'fatigu', 'quick', 'url', 'realli', 'dig', 'part', '1', '46', '2', '50', 'url', 'ban', 'thread', 'requir', 'get', 'high', 'backyard', 'roast', 'eat', 'marshmellow', 'backyard', 'convers', 'someth', 'intellectu', 'follow', 'massag', 'kiss', 'url', 'url', 'url', 'ban', 'mani', 'b', 'sentenc', 'could', 'think', 'b', 'ban', 'watch', 'movi', 'corner', 'dunc', 'ban', 'health', 'class', 'clear', 'taught', 'noth', 'peer', 'pressur', 'ban', 
'whole', 'host', 'reason', 'url', '1', 'two', 'babi', 'deer', 'left', 'right', 'munch', 'beetl', 'middl', '2', 'use', 'blood', 'two', 'caveman', 'diari', 'today', 'latest', 'happen', 'design', 'cave', 'diari', 'wall', '3', 'see', 'pokemon', 'world', 'infj', 'societi', 'everyon', 'becom', 'optimist', '49142', 'url', 'url', 'url', 'url', 'artist', 'artist', 'draw', 'idea', 'count', 'form', 'someth', 'like', 'signatur', 'welcom', 'robot', 'rank', 'person', 'down', 'self', 'esteem', 'cuz', 'avid', 'signatur', 'artist', 'like', 'proud', 'ban', 'take', 'room', 'bed', 'ya', 'got', 'ta', 'learn', 'share', 'roach', 'url', 'ban', 'much', 'thunder', 'grumbl', 'kind', 'storm', 'yep', 'ahh', 'old', 'high', 'school', 'music', 'heard', 'age', 'url', 'fail', 'public', 'speak', 'class', 'year', 'ago', 'sort', 'learn', 'could', 'better', 'posit', 'big', 'part', 'failur', 'overload', 'like', 'person', 'mental', 'confirm', 'intj', 'way', 'url', 'move', 'denver', 'area', 'start', 'new', 'life']